Data Visualization (part 1)

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
from IPython.display import display 
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.figure_factory as ff
import seaborn as sns
In [3]:
# Directory that holds the WiDS Datathon 2021 CSV files.
# Set the WIDS_DATA_DIR environment variable to point somewhere else;
# the fallback is the path originally hard-coded in this notebook.
DATA_DIR = os.environ.get('WIDS_DATA_DIR', r'C:\Users\Einav bezalel\PycharmProjects\Datathon2021\input')

train = pd.read_csv(os.path.join(DATA_DIR, 'TrainingWiDS2021.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'UnlabeledWiDS2021.csv'))
dictionary = pd.read_csv(os.path.join(DATA_DIR, 'DataDictionaryWiDS2021.csv'))
In [4]:
# Preview the first five rows of the training set.
train.head()
Out[4]:
Unnamed: 0 encounter_id hospital_id age bmi elective_surgery ethnicity gender height hospital_admit_source ... h1_pao2fio2ratio_max h1_pao2fio2ratio_min aids cirrhosis hepatic_failure immunosuppression leukemia lymphoma solid_tumor_with_metastasis diabetes_mellitus
0 1 214826 118 68.0 22.732803 0 Caucasian M 180.3 Floor ... NaN NaN 0 0 0 0 0 0 0 1
1 2 246060 81 77.0 27.421875 0 Caucasian F 160.0 Floor ... 51.0 51.0 0 0 0 0 0 0 0 1
2 3 276985 118 25.0 31.952749 0 Caucasian F 172.7 Emergency Department ... NaN NaN 0 0 0 0 0 0 0 0
3 4 262220 118 81.0 22.635548 1 Caucasian F 165.1 Operating Room ... 337.0 337.0 0 0 0 0 0 0 0 0
4 5 201746 33 19.0 NaN 0 Caucasian M 188.0 NaN ... NaN NaN 0 0 0 0 0 0 0 0

5 rows × 181 columns

In [5]:
# Preview the first five rows of the unlabeled (test) set.
test.head()
Out[5]:
Unnamed: 0 encounter_id hospital_id age bmi elective_surgery ethnicity gender height hospital_admit_source ... h1_arterial_po2_min h1_pao2fio2ratio_max h1_pao2fio2ratio_min aids cirrhosis hepatic_failure immunosuppression leukemia lymphoma solid_tumor_with_metastasis
0 1 144740 10141 72 NaN 0 Caucasian F 152.4 Floor ... NaN NaN NaN 0 0 0 0 0 0 0
1 2 141990 10141 86 NaN 0 Caucasian F 175.3 Emergency Department ... NaN NaN NaN 0 0 0 0 0 0 0
2 3 142038 10141 72 NaN 0 Caucasian F 162.6 Floor ... NaN NaN NaN 0 0 0 0 0 0 0
3 4 138628 10141 66 NaN 0 Caucasian M 177.8 Floor ... NaN NaN NaN 0 0 0 0 0 0 0
4 5 141682 10141 89 NaN 0 Caucasian M 170.2 Direct Admit ... NaN NaN NaN 0 0 0 0 0 0 0

5 rows × 180 columns

In [6]:
# Preview the first ten rows of the data dictionary.
dictionary.head(10)
Out[6]:
Category Variable Name Unit of Measure Data Type Description Example
0 identifier encounter_id None integer Unique identifier associated with a patient un... None
1 identifier hospital_id None integer Unique identifier associated with a hospital None
2 demographic age Years numeric The age of the patient on unit admission None
3 demographic bmi kilograms/metres^2 string The body mass index of the person on unit admi... 21.5
4 demographic elective_surgery None binary Whether the patient was admitted to the hospit... 0
5 demographic ethnicity None string The common national or cultural tradition whic... Caucasian
6 demographic gender None string The genotypical sex of the patient F
7 demographic height centimetres numeric The height of the person on unit admission 180
8 demographic hospital_admit_source None string The location of the patient prior to being adm... Home
9 demographic icu_admit_source None string The location of the patient prior to being adm... Operating room
In [7]:
# (rows, columns) of the training set.
train.shape
Out[7]:
(130157, 181)
In [8]:
# (rows, columns) of the test set.
test.shape
Out[8]:
(10234, 180)
In [9]:
# Number of distinct values per column (nunique works down each column by default), largest first.
# 'Unnamed: 0' and 'encounter_id' are unique on every row, so they need to be dropped.
# 'readmission_status' has only one value (0), so we'll drop it as well.
train.nunique().sort_values(ascending=False)
Out[9]:
Unnamed: 0               130157
encounter_id             130157
bmi                       41453
urineoutput_apache        32958
pre_icu_los_days          10912
                          ...  
apache_post_operative         2
arf_apache                    2
gcs_unable_apache             2
diabetes_mellitus             2
readmission_status            1
Length: 181, dtype: int64
In [10]:
# Same distinct-value count per column, this time for the test set.
test.nunique().sort_values(ascending=False)
Out[10]:
Unnamed: 0                     10234
encounter_id                   10234
urineoutput_apache              2953
pre_icu_los_days                2656
d1_pao2fio2ratio_max            1306
                               ...  
apache_post_operative              2
arf_apache                         2
gcs_unable_apache                  2
solid_tumor_with_metastasis        2
readmission_status                 1
Length: 180, dtype: int64
In [11]:
# We suspect that hospital_id comes from different distributions in test and train, so check
# whether any hospital_id is shared between the two sets.
# Overlap is symmetric (a train id found in test is also a test id found in train),
# so a single membership test is enough.
if train['hospital_id'].isin(test['hospital_id']).any():
    print("train's hospital_id is in test")
else:
    print("hospital_id is completly different between the sets") #if so, we need to drop it
hospital_id is completly different between the sets
In [12]:
# Row identifiers, the constant readmission_status and the non-overlapping hospital_id
# are removed from both sets.
uninformative_cols = ['Unnamed: 0', 'encounter_id', 'readmission_status', 'hospital_id']
train = train.drop(columns=uninformative_cols)
test = test.drop(columns=uninformative_cols)
In [13]:
# Compare the variable names listed in the data dictionary with train's columns
# and print the dictionary entries that train does not contain.
variables_of_dict = dictionary['Variable Name']
variables_of_train = train.columns
present_in_train = variables_of_dict.isin(variables_of_train)
print(variables_of_dict[~present_in_train])
0           encounter_id
1            hospital_id
10        icu_admit_type
15    readmission_status
Name: Variable Name, dtype: object
In [14]:
# Keep only the dictionary rows whose variable still exists in train
# (removes encounter_id, hospital_id, icu_admit_type and readmission_status).
# Filtering by name instead of hard-coded row labels keeps the cell safe to re-run.
dictionary = dictionary[dictionary['Variable Name'].isin(train.columns)]
In [15]:
# Count the positions where the dictionary's variable names differ from train's columns;
# zero means the two line up.
name_mismatches = dictionary['Variable Name'] != variables_of_train
print(name_mismatches.sum())
0
In [16]:
# Name of the target column, followed by the number of rows in each of its classes.
target_col='diabetes_mellitus'
train[target_col].value_counts()
Out[16]:
0    102006
1     28151
Name: diabetes_mellitus, dtype: int64
In [17]:
# Share of each target class, in percent of all patients.
counts=train[target_col].value_counts(normalize=True)*100
x = ['No Diabetes', 'Diabetes']
# Round to one decimal rather than truncating with int(): truncation silently
# dropped the fractional part of each bar (e.g. 78.37 -> 78 and 21.63 -> 21).
y = [round(float(counts[0]), 1), round(float(counts[1]), 1)]
color=['rgb(0,0,225)','rgb(255,0,0)' ]

fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, hovertext=['{:.1f}%'.format(value) for value in y], width=0.5,
                     marker=dict(color=color, line_color='rgb(179,179,179)', line_width=2, opacity=0.6)))

fig.update_layout(showlegend=False,
                  title={'text': "Percentage of Patients by Target [%]",
                         'y':0.95,
                         'x':0.5,
                         'xanchor': 'center',
                         'yanchor': 'top'})

fig.update_yaxes(title_text='Percentage [%]', title_standoff = 30)

fig.show()
In [18]:
# Ratio of the class-0 share to the class-1 share, rounded to two decimals.
class_ratio = round(counts[0] / counts[1], 2)
print('Proportion: ', class_ratio, ":1")
Proportion:  3.62 :1

Missing Data Visualization

In [19]:
def plot_heatmap(columns_list, null=True):
    """Show a lower-triangle correlation heatmap for a subset of ``train`` columns.

    Columns that are completely filled or completely empty are removed first,
    because their missingness indicator is constant.

    Parameters
    ----------
    columns_list : list-like of str
        Names of the ``train`` columns to include.
    null : bool, default True
        If True, correlate the missingness indicators (``isnull()``) of the
        columns. If False, correlate the values of the numeric columns.

    Returns
    -------
    None
        The figure is shown and every pair whose correlation is above 0.5 is
        printed. The "none" template is applied to this figure only; the global
        plotly default template is not modified.
    """
    df = train[columns_list]

    # Keep only partially-missing columns: a null fraction of exactly 0 or 1
    # means the null indicator has zero variance.
    null_fraction = df.isnull().mean()
    partially_missing = ((null_fraction > 0) & (null_fraction < 1)).to_numpy()
    df = df.loc[:, partially_missing]

    if null:
        corr_mat = df.isnull().corr()
        title = 'Correlation Matrix of Nulls'
    else:
        # Select numeric/boolean columns explicitly. Older pandas dropped
        # non-numeric columns inside corr(); pandas >= 2.0 raises on them instead.
        corr_mat = df.select_dtypes(include=[np.number, bool]).corr()
        title = 'Correlation Matrix of (Null is excluded)'

    # The matrix is symmetric, so hide the upper triangle (diagonal included).
    # Built-in bool is used because the np.bool alias was removed from NumPy.
    upper_triangle = np.triu(np.ones(corr_mat.shape, dtype=bool))
    corr1 = corr_mat.mask(upper_triangle)  # masked cells become NaN
    X = corr1.columns.values

    heat = go.Heatmap(z=corr1,
                      x=X,
                      y=X,
                      xgap=1, ygap=1,
                      colorscale='RdBu',
                      colorbar_thickness=20,
                      colorbar_ticklen=3,
                      zmid=0)

    fig = go.Figure()
    fig.add_trace(heat)
    fig.update_layout(title_text=title, title_x=0.5,
                      width=750, height=600,
                      xaxis_showgrid=False,
                      yaxis_showgrid=False,
                      yaxis_autorange='reversed',
                      margin=dict(l=180, r=50, b=180, t=100),
                      template="none")  # no grey background, for this figure only
    fig.update_yaxes(tickangle = -20)
    fig.update_xaxes(tickangle = -290)
    fig.show()

    # List every remaining pair whose correlation is higher than 0.5.
    sort_corr = corr1.stack().sort_values(ascending=False).reset_index()
    print(sort_corr[sort_corr[0] > 0.5])
In [20]:
# Distinct values of the dictionary's 'Category' column.
categories= dictionary['Category'].unique()
In [21]:
# Every category except the last one.
categories[:-1]
Out[21]:
array(['demographic', 'APACHE covariate', 'vitals', 'labs',
       'labs blood gas', 'APACHE comorbidity'], dtype=object)
In [22]:
# Map every category (spaces replaced by underscores) to the variable names it contains.
# A dictionary plus explicit assignments replaces the previous writes into locals(),
# so each name used by later cells is visible in the source.
category_columns = {
    category.replace(' ', '_'): dictionary.loc[dictionary['Category'] == category, 'Variable Name'].values
    for category in categories[:-1]
}

demographic = category_columns['demographic']
APACHE_covariate = category_columns['APACHE_covariate']
vitals = category_columns['vitals']
labs = category_columns['labs']
labs_blood_gas = category_columns['labs_blood_gas']
APACHE_comorbidity = category_columns['APACHE_comorbidity']

APACHE_comorbidity
Out[22]:
array(['aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression',
       'leukemia', 'lymphoma', 'solid_tumor_with_metastasis'],
      dtype=object)
In [23]:
# Demographic features: value correlations first, then missingness correlations.
for use_nulls in (False, True):
    plot_heatmap(demographic, use_nulls)
  level_0 level_1         0
0  weight     bmi  0.878656
  level_0 level_1         0
0  weight     bmi  0.874653
1  height     bmi  0.673698
In [24]:
# Vitals features: value correlations first, then missingness correlations.
for use_nulls in (False, True):
    plot_heatmap(vitals, use_nulls)
                       level_0                    level_1         0
0    d1_diasbp_noninvasive_max              d1_diasbp_max  0.996682
1    d1_diasbp_noninvasive_min              d1_diasbp_min  0.996599
2     h1_sysbp_noninvasive_max               h1_sysbp_max  0.996537
3       h1_mbp_noninvasive_min                 h1_mbp_min  0.996411
4     d1_sysbp_noninvasive_max               d1_sysbp_max  0.996404
..                         ...                        ...       ...
207                 h1_mbp_max        h1_mbp_invasive_max  0.508478
208  h1_diasbp_noninvasive_min     h1_diasbp_invasive_max  0.507668
209      h1_sysbp_invasive_max        h1_mbp_invasive_min  0.506240
210                 h1_mbp_max  d1_diasbp_noninvasive_max  0.503187
211                 h1_mbp_max              d1_diasbp_max  0.501992

[212 rows x 3 columns]
                    level_0                    level_1         0
0    d1_diasbp_invasive_min     d1_diasbp_invasive_max  1.000000
1             h1_diasbp_min              h1_diasbp_max  1.000000
2                d1_mbp_min                 d1_mbp_max  1.000000
3    d1_mbp_noninvasive_min     d1_mbp_noninvasive_max  1.000000
4           d1_resprate_min            d1_resprate_max  1.000000
..                      ...                        ...       ...
253        h1_heartrate_min  h1_diasbp_noninvasive_max  0.531034
254         d1_resprate_min           d1_heartrate_min  0.502095
255         d1_resprate_min           d1_heartrate_max  0.502095
256         d1_resprate_max           d1_heartrate_min  0.502095
257         d1_resprate_max           d1_heartrate_max  0.502095

[258 rows x 3 columns]
In [26]:
# Set plotly's global default template to "plotly" for the figures that follow.
pio.templates.default = "plotly" #return to the default template
In [27]:
def _plot_gender_density(male_values, female_values, x_title, x_range, title, colors):
    """Show male vs. female density curves (no histogram, no rug) for one measurement."""
    # create_distplot draws a KDE curve by default; the y axis is a probability density.
    fig = ff.create_distplot([male_values.dropna(), female_values.dropna()], ["Male", "Female"],
                             show_hist=False, bin_size=.25, colors=colors, show_rug=False)
    fig.update_xaxes(title_text=x_title, range=x_range)
    fig.update_yaxes(title_text='probability density', title_standoff=30)
    fig.update_layout(title_text=title)
    fig.show()
    return fig


train_groupby = train.groupby("gender")

# Per-gender series are assigned explicitly (instead of through locals()) because
# later cells refer to these four names.
male_weight = train_groupby["weight"].get_group('M')
female_weight = train_groupby["weight"].get_group('F')
male_height = train_groupby["height"].get_group('M')
female_height = train_groupby["height"].get_group('F')

colors = ['lightseagreen','indianred']

fig = _plot_gender_density(male_weight, female_weight, 'weight [kg]', [35, 190], 'Weight Distribution', colors)
fig = _plot_gender_density(male_height, female_height, 'height [cm]', [135, 196], 'Height Distribution', colors)
In [28]:
# Side-by-side box plots of weight (left) and height (right), split by gender.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Weight [kg]','Height [cm]'))

# (values, trace name, colour, subplot column), in drawing order.
box_specs = [
    (male_weight, 'Male', 'lightseagreen', 1),
    (female_weight, 'Female', 'indianred', 1),
    (male_height, 'Male', 'lightseagreen', 2),
    (female_height, 'Female', 'indianred', 2),
]

for values, label, shade, subplot_col in box_specs:
    # Only the first subplot contributes legend entries; the second one hides its own.
    legend_kwargs = {'showlegend': False} if subplot_col == 2 else {}
    # boxmean='sd' draws the mean and the standard deviation on each box.
    fig.add_trace(go.Box(y=values, name=label, marker_color=shade, boxmean='sd', **legend_kwargs),
                  row=1, col=subplot_col)

fig.show()
In [29]:
# Fill missing weight/height with the most frequent value of the same gender.
# The modes come from the train groups only and are reused for test.
# .iloc[0] takes a single scalar even when several values tie for the mode
# (mode() returns a Series, which cannot be broadcast when it has more than one entry).
gender_modes = {
    "weight": {"F": female_weight.mode().iloc[0], "M": male_weight.mode().iloc[0]},
    "height": {"F": female_height.mode().iloc[0], "M": male_height.mode().iloc[0]},
}

for frame in (train, test):
    for column, mode_by_gender in gender_modes.items():
        for gender, mode_value in mode_by_gender.items():
            # Each column is filled only where that same column is missing. The previous
            # version keyed the weight fill on missing *height*, which overwrote valid
            # weights and left missing weights untouched.
            missing = frame[column].isna() & (frame["gender"] == gender)
            frame.loc[missing, column] = mode_value
In [30]:
# Rows where gender, height and weight are all missing: 18 rows in train, 0 rows in test,
# so only train needs filtering.
train_all_missing = train.gender.isna() & train.height.isna() & train.weight.isna()
train = train.loc[~train_all_missing].reset_index(drop=True)
In [31]:
# Age distribution as a probability density, one colour per target class.
fig = px.histogram(data_frame=train, x="age", color=target_col, histnorm='probability density')
fig.update_yaxes(title_standoff=30, title_text='probability density')
fig.show()
In [32]:
# An age of 0 occurs in only 0.023049% of the train rows and never in test.
# Those ages are replaced by NaN (the rows themselves are kept).
for frame in (train, test):
    frame.loc[frame.age == 0, 'age'] = np.nan
In [33]:
# Weight against BMI, coloured by target class, with a box plot on each margin.
tar = train[target_col].replace([0,1],["No", "Yes"])

fig = px.scatter(train, x="weight", y="bmi", color=tar,
                 marginal_x="box", marginal_y="box",
                 title="Weight Vs BMI")

fig.update_layout(
    xaxis_title="Weight [kg]",
    yaxis_title="BMI",
    legend_title="Diabetes")  # label typo fixed (was "Diabese")

fig.show()
In [34]:
# Age against BMI, coloured by target class, to see how the two classes are distributed.
tar = train[target_col].replace([0,1],["No Diabetes", "Diabetes"])  # label typos fixed (was "Diabese")

# The title is set once in update_layout below.
fig = px.scatter(train, x="age", y="bmi", color=tar)

fig.update_layout(
    title="Age Vs. BMI",
    xaxis_title="Age [years]",
    yaxis_title="BMI",
    legend_title="Target")

fig.show()
#we can see an overlap between the classes...
#click on the legend to see each class separately!
In [ ]: